Author: Amalie Skålevåg, skalevag2@uni-potsdam.de

Introduction¶

Evaluating results of cluster analysis on automatically detected events, with my own event characteristics.

Modules¶

In [1]:
# modules
import random
from functools import partial
from pathlib import Path

import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg
import scipy.stats
import shap
import sklearn.metrics
from IPython.display import Image
from matplotlib.backends.backend_pdf import PdfPages

import hysevt.clustering.metrics_clustering
import hysevt.events.metrics
from hysevt.utils import tools, visualise

Paths and directories¶

In [2]:
# set base directory
baseDIR = Path("..").resolve()
assert baseDIR.is_dir()

# data directory
dataDIR = "/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/data"
dataDIR = Path(dataDIR)
localDataDIR = baseDIR.joinpath("data")

# station
station_name = "vent"
station_id = 201350  # station id for Vent-Rofental gauging station, as used by HD Tirol

# events
events_version = "automatic_local_minimum_events"
eventsDIR = localDataDIR.joinpath("events_database", station_name, events_version)
file_events = eventsDIR.joinpath(
    f"{station_id}_{station_name}_event_metrics.csv",
)

# results
resDIR = baseDIR.joinpath("results")
assert resDIR.is_dir()
figDIR = baseDIR.joinpath("figures")
assert figDIR.is_dir()

cluster_type = "metrics_clustering"
algorithm = "gaussian_mixture_model"
input_data = "reduced"
clusteringDIR = resDIR.joinpath(
    f"{events_version}_{station_name}", cluster_type, algorithm
)

Preprocessed data¶

In [3]:
annual_SSY = pd.read_csv(
    list(
        localDataDIR.joinpath("preprocessed").glob(
            f"{station_id}_{station_name}_annual_sediment_yield.csv"
        )
    )[0],
    index_col=0,
)

# files
file_data = localDataDIR.joinpath(
    "preprocessed", f"{station_id}_{station_name}_SSC_Q.csv"
)

# data
gauge_data = hysevt.events.metrics.get_gauge_data(file_data)
In [4]:
training_data = pd.read_csv(
    resDIR.joinpath(
        f"{events_version}_{station_name}",
        cluster_type,
        f"training_data_{input_data}.csv",
    ),
    index_col=0,
)
X_train = np.load(
    resDIR.joinpath(
        f"{events_version}_{station_name}", cluster_type, f"X_{input_data}.npy"
    )
)

training_features = tools.load_list_from_txt(
    resDIR.joinpath(
        f"{events_version}_{station_name}", cluster_type, "training_data_features.txt"
    )
)

assert X_train.shape[0] == len(training_data)
event_ids = training_data.index

Compare fits¶

In [5]:
runs = list(clusteringDIR.glob("run_20230615*"))

fit_results = {}
for runDIR in runs:
    models = list(runDIR.glob(f"*{input_data}*.joblib"))
    try:
        cov_type = tools.load_dict_from_json(runDIR.joinpath("settings.json"))[
            "covariance_type"
        ]
    except KeyError:
        continue
    print(cov_type)

    output = []
    for model in models:
        try:
            output.append(
                (
                    joblib.load(model).n_components,
                    joblib.load(model).bic(training_data),
                    sklearn.metrics.calinski_harabasz_score(
                        training_data, joblib.load(model).predict(training_data)
                    ),
                    sklearn.metrics.silhouette_score(
                        training_data, joblib.load(model).predict(training_data)
                    ),
                    joblib.load(model).covariances_.size
                    + joblib.load(model).means_.size
                    + joblib.load(model).weights_.size,
                )
            )
        except ValueError:
            output.append(
                (
                    joblib.load(model).n_components,
                    joblib.load(model).bic(training_data),
                    np.nan,
                    np.nan,
                    np.nan,
                )
            )
    fit_results[f"{runDIR.name}_{cov_type}"] = pd.DataFrame(
        output,
        columns=[
            "k",
            "BIC",
            "calinski_harabasz_score",
            "silhouette_score",
            "parameters",
        ],
    ).sort_values("k")
spherical
diag
full
tied
In [6]:
fig, ax = plt.subplots(nrows=3, figsize=(5, 7), sharex=True)

for i, score in enumerate(["BIC", "calinski_harabasz_score", "silhouette_score"]):
    for run in fit_results:
        if "tied" in run:
            continue
        fit_results[run].plot(
            x="k", y=score, label=run.split("_")[-1], ax=ax[i], legend=False
        )
        if i == 0:
            ax[i].legend(bbox_to_anchor=(1, 1), title="Covariance type")
    ax[i].set_ylabel(score)
    ax[i].set_xticks(fit_results[run]["k"])
In [7]:
fig, ax = plt.subplots()

for run in fit_results:
    fit_results[run].plot(
        x="k", y="parameters", label=run.split("_")[-1], ax=ax, legend=False
    )
    ax.legend(bbox_to_anchor=(1, 1))
ax.set_ylabel("number of fitted parameters")
ax.set_xticks(fit_results[run]["k"])
ax.hlines(
    training_data.shape[0],
    min(fit_results[run]["k"]),
    max(fit_results[run]["k"]),
    color="grey",
)
plt.xlim(2, 20)
Out[7]:
(2.0, 20.0)
In [8]:
k = 4

fig, ax = plt.subplots(nrows=2, sharex=True)

for i, score in enumerate(["BIC", "calinski_harabasz_score"]):
    sel_runs = []
    for run in fit_results:
        if "tied" in run:
            continue
        else:
            sel_runs.append(run)
        fit_results[run].plot(x="k", y=score, label=run, ax=ax[i], legend=False)
        if i == 0:
            ax[i].legend(bbox_to_anchor=(1, 1), title="Covariance type")
    ax[i].set_ylabel(score)
    ax[i].set_xticks(fit_results[run]["k"])
    ax[i].vlines(
        k,
        min([fit_results[run][score].min() for run in sel_runs]),
        max([fit_results[run][score].max() for run in sel_runs]),
        color="grey",
    )

plt.savefig(
    figDIR.joinpath(f"model_selection_{cluster_type}_{algorithm}_{input_data}.png"),
    dpi=300,
    bbox_inches="tight",
)

Selected model¶

In [9]:
if algorithm == "gaussian_mixture_model":
    print(list(clusteringDIR.glob("run_*")))
    run_id = "202306151133"  # clustering run id
    clusterDIR = clusteringDIR.joinpath(f"run_{run_id}")
else:
    clusterDIR = clusteringDIR
assert clusterDIR.is_dir()
clusterDIR
[PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151133'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151138'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202304171429'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151143'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151142')]
Out[9]:
PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151133')

The number of clusters may need to be evaluated for some clustering algorithms.

In [10]:
settings = tools.load_dict_from_json(clusterDIR.joinpath("settings.json"))
settings
Out[10]:
{'n_init': 100,
 'init_params': 'random_from_data',
 'verbose': 0,
 'covariance_type': 'spherical'}

For the test example the number of clusters were determined:

In [11]:
# number of clusters
k = 4
print(k)

# cluster colors
cluster_colors = plt.cm.rainbow(np.linspace(0, 1, k))
4
In [12]:
# save results from best clustering to subfolder
ODIR = clusterDIR.joinpath(f"best_result_{input_data}_{k}_clusters")
if not ODIR.is_dir():
    ODIR.mkdir()

Data¶

Events¶

In [13]:
event_metrics = pd.read_csv(file_events, index_col=0)
event_metrics.start = pd.to_datetime(event_metrics.start)
event_metrics.end = pd.to_datetime(event_metrics.end)
event_metrics = event_metrics.set_index("event_id")
event_metrics.head()
Out[13]:
start end duration seasonal_timing year SSY pSSY SSC_max SSC_mean SSC_mean_weighted ... Q_max_log Q_mean_log Q_max_previous_ratio time_since_last_event last_event_SSY_elapsed_time_logratio last_event_Qtotal_elapsed_time_logratio last_event_Q_max_elapsed_time_logratio last_event_SSC_max_elapsed_time_logratio SHI AHI
event_id
VEN-AUT-2006-001 2006-05-18 08:45:00 2006-05-19 09:45:00 25.00 138 2006 1069.088754 0.004492 3736.9221 1079.428631 1214.423695 ... 2.700690 2.270532 NaN NaN NaN NaN NaN NaN 0.034235 0.586656
VEN-AUT-2006-002 2006-05-28 10:15:00 2006-06-05 09:30:00 191.25 148 2006 1806.702578 0.007591 2508.0991 483.575956 641.000629 ... 2.523326 1.408162 0.177364 216.50 1.596971 8.310457 -2.676901 2.848427 -0.128928 0.447181
VEN-AUT-2006-003 2006-06-13 09:00:00 2006-06-14 08:45:00 23.75 164 2006 882.157443 0.003706 1874.1661 931.966941 1011.271536 ... 2.635480 2.312174 -0.112154 191.50 2.244371 9.596851 -2.731562 2.572393 0.037343 0.242092
VEN-AUT-2006-004 2006-06-14 09:00:00 2006-06-15 09:30:00 24.50 165 2006 1041.399567 0.004375 1799.2461 996.552409 1043.353769 ... 2.687847 2.416121 -0.052368 0.25 8.168665 15.065212 4.021774 8.922213 0.061433 0.032412
VEN-AUT-2006-005 2006-06-15 09:45:00 2006-06-16 09:00:00 23.25 166 2006 1144.288279 0.004808 1905.8293 1073.529931 1111.614591 ... 2.758109 2.498790 -0.070262 0.25 8.334615 15.199930 4.074142 8.881417 0.005221 -0.011580

5 rows × 35 columns

Preprocessed events¶

In [14]:
data = np.load(
    eventsDIR.joinpath(
        f"{station_id}_{station_name}_preprocessed_events_for_METS_clustering.npy"
    )
)
data_event_id = tools.load_list_from_txt(
    eventsDIR.joinpath(f"{station_id}_{station_name}_event_id_list.txt")
)
mask = np.array([ID in event_ids for ID in data_event_id])
data = data[mask]

Cluster results¶

In [15]:
model = joblib.load(
    clusterDIR.joinpath(f"{algorithm}_{k}_clusters_{input_data}_data.joblib")
)
if algorithm == "gaussian_mixture_model":
    y = model.predict(training_data)
    prob = model.predict_proba(training_data)
    cluster_prob = pd.DataFrame(
        prob, index=event_ids, columns=[f"prob_cluster_{c}" for c in range(k)]
    )
else:
    y = model.labels_
labels = pd.Series(y, index=event_ids).rename("cluster_id")
event_metrics_labels = event_metrics.join(labels)

Functions¶

In [16]:
help(hysevt.clustering.evaluation.calc_metric_zscores)
help(hysevt.clustering.evaluation.calc_cluster_zscore_averages)
help(hysevt.clustering.evaluation.calc_cluster_zscore_error_range)
Help on function calc_metric_zscores in module hysevt.clustering.evaluation:

calc_metric_zscores(metrics, cluster_labels, drop_cols=['start', 'end'])

Help on function calc_cluster_zscore_averages in module hysevt.clustering.evaluation:

calc_cluster_zscore_averages(metric_zscores, mode='median', cluster_id_col='cluster_id')

Help on function calc_cluster_zscore_error_range in module hysevt.clustering.evaluation:

calc_cluster_zscore_error_range(metric_zscores, error_range='full', cluster_id_col='cluster_id')

Magnitude of events¶

In [17]:
fig, ax = plt.subplots(figsize=(5, 4))
for c, color in zip(range(k), cluster_colors):

    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="Qtotal",
        y="SSY",
        ax=ax,
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
        logx=False,
        logy=False,
    )
    # ax.set_title(f"Cluster {c} (n={(event_metrics_labels.cluster_id == c).sum()})")

ax.set_xlabel("Total streamflow volume [m3]")
ax.set_ylabel("Event SSY [t]")
ax.legend()

plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_scatter_SSY_Qtotal.png"),
    dpi=300,
    bbox_inches="tight",
)
In [18]:
fig, ax = plt.subplots(figsize=(9, 4), ncols=2, sharey=True)

for c, color in zip(range(k), cluster_colors):

    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="Qtotal",
        y="SSY",
        ax=ax[0],
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
        logx=False,
        logy=False,
    )

ax[0].set_xlabel("Total streamflow volume [$m^3$]")
ax[0].set_ylabel("Event SSY [$t$]")
ax[0].legend()

for c, color in zip(range(k), cluster_colors):

    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="seasonal_timing",
        y="SSY",
        ax=ax[1],
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
        legend=False,
    )

ax[1].set_xlabel("Day of year")
ax[1].set_ylabel("")

plt.tight_layout()

plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_scatter.png"),
    dpi=300,
    bbox_inches="tight",
)

Seasonality of clusters¶

In [19]:
fig, ax = plt.subplots(figsize=(5, 4))
for c, color in zip(range(k), cluster_colors):

    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="seasonal_timing",
        y="SSY",
        ax=ax,
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
    )
    # ax.set_title(f"Cluster {c} (n={(event_metrics_labels.cluster_id == c).sum()})")

ax.set_xlabel("Day of year")
ax.legend()

plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_scatter_SSY_DOY.png"),
    dpi=300,
    bbox_inches="tight",
)
In [20]:
for c, color in zip(range(k), cluster_colors):
    fig, ax = plt.subplots()
    event_metrics_labels[event_metrics_labels.cluster_id != c].plot.scatter(
        x="seasonal_timing", y="SSY", color="grey", alpha=0.2, ax=ax
    )
    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="seasonal_timing", y="SSY", ax=ax, color=color
    )
    ax.set_title(f"Cluster {c} (n={(event_metrics_labels.cluster_id == c).sum()})")
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_scatter_cluster{c}.png"),
        dpi=300,
        bbox_inches="tight",
    )

GMM cluster probabilities¶

In [21]:
event_metrics_cluster_prob = event_metrics.join(cluster_prob)
training_data_cluster_prob = training_data.join(cluster_prob)
In [22]:
def plot_cluster_prob(features_with_cluster_prob, k, x, y, x_label=None, y_label=None):
    seq_cmaps = [
        "Purples",
        "Blues",
        "Greens",
        "Oranges",
        "Reds",
        "YlOrBr",
        "YlOrRd",
        "OrRd",
        "PuRd",
        "RdPu",
        "BuPu",
        "GnBu",
        "PuBu",
        "YlGnBu",
        "PuBuGn",
        "BuGn",
        "YlGn",
    ]

    if x_label is None:
        x_label = x
    if y_label is None:
        y_label = y

    fig, ax = plt.subplots(
        ncols=k // 2, nrows=2, figsize=(2.5 * k, 3 * 2), sharex=True, sharey=True
    )
    ax = ax.ravel()

    for c, cmap in zip(range(k), seq_cmaps[:k]):
        features_with_cluster_prob.plot.scatter(
            ax=ax[c],
            x=x,
            y=y,
            c=f"prob_cluster_{c}",
            cmap=cmap,
            vmin=0,
            vmax=1,
            alpha=0.5,
            edgecolor="none",
        )
        ax[c].set_title(f"Cluster {c}")
        ax[c].set_ylabel("")
    fig.text(0.5, -0.03, x_label, fontsize=18)
    fig.text(-0.03, 0.5, y_label, fontsize=18, rotation="vertical")
    plt.tight_layout()
In [23]:
x = "Qtotal"
y = "SSY"
plot_cluster_prob(event_metrics_cluster_prob, k, x=x, y=y)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
    dpi=300,
    bbox_inches="tight",
)

x = "SHI"
y = "SSY"
plot_cluster_prob(event_metrics_cluster_prob, k, x=x, y=y)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
    dpi=300,
    bbox_inches="tight",
)

x = "seasonal_timing"
y = "SSY"
plot_cluster_prob(event_metrics_cluster_prob, k, x=x, y=y)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
    dpi=300,
    bbox_inches="tight",
)

if input_data == "reduced":
    x = "PC1"
    y = "PC2"
    plot_cluster_prob(training_data_cluster_prob, k, x=x, y=y)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
        dpi=300,
        bbox_inches="tight",
    )
    x = "PC1"
    y = "PC3"
    plot_cluster_prob(training_data_cluster_prob, k, x=x, y=y)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
        dpi=300,
        bbox_inches="tight",
    )
    x = "PC1"
    y = "PC4"
    plot_cluster_prob(training_data_cluster_prob, k, x=x, y=y)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
        dpi=300,
        bbox_inches="tight",
    )

Event response pattern and hysteresis of each cluster¶

Representative events for each cluster¶

In [24]:
cluster_representative = []
for c in range(k):
    # get event with highest probability
    event_id = random.choice(
        list(cluster_prob[cluster_prob[f"prob_cluster_{c}"] > 0.9].index)
    )
    cluster_representative.append(event_id)
cluster_representative
Out[24]:
['VEN-AUT-2015-037',
 'VEN-AUT-2006-075',
 'VEN-AUT-2009-030',
 'VEN-AUT-2007-085']
In [25]:
fig, ax = plt.subplots(nrows=2, ncols=k, figsize=(k * 3, 4), sharey=False)
for c in range(k):
    event = event_metrics.loc[cluster_representative[c]]
    event_series = hysevt.events.metrics.get_event_data(
        event.start, event.end, gauge_data
    )

    visualise.plotEventSeries(event_series, ax=ax[0][c])
    visualise.plotEventHysteresis(event_series, ax=ax[1][c])
    # ax[0][c].set_title(f"{event.start:%Y-%m-%d},duration: {event.duration} h")
    ax[0][c].set_title(f"Cluster {c}: Example", fontsize=16)
    ax[0][c].set_xticks((ax[0][c].get_xticks()[0], ax[0][c].get_xticks()[-1]))
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"example_events_with_hysteresis_{'_'.join(cluster_representative)}_EGUposter.png"
    ),
    dpi=600,
    bbox_inches="tight",
)

All cluster events to PDF¶

In [26]:
for c in range(k):
    if ODIR.joinpath(f"cluster{c}_event_hystersis_plots.pdf").is_file():
        continue
    with PdfPages(ODIR.joinpath(f"cluster{c}_event_hystersis_plots.pdf")) as pdf:
        for event_id in labels[labels == c].index.to_list():
            fig, ax = plt.subplots(ncols=2, figsize=(8, 3), sharey=False)
            event = event_metrics.loc[event_id]
            event_series = hysevt.events.metrics.get_event_data(
                event.start, event.end, gauge_data
            )

            visualise.plotEventSeries(event_series, ax=ax[0])
            visualise.plotEventHysteresis(event_series, ax=ax[1])
            ax[0].set_title(
                f"{event_id}, {event.start:%Y-%m-%d %H:%M}, {event.duration:.0f} hours"
            )
            fig.suptitle(
                f"Probability cluster {c}: {event_metrics_cluster_prob[f'prob_cluster_{c}'][event_id]:.2f}",
                color=cluster_colors[c],
            )
            plt.tight_layout()
            pdf.savefig()
            plt.close()

Explore model fit and parameters¶

Table¶

In [27]:
gmm_parameters = (
    pd.DataFrame(
        model.means_,
        columns=[f"Mean_{col}" for col in training_data.columns],
        index=[f"Cluster{c}" for c in range(k)],
    )
    .join(
        pd.DataFrame(
            model.covariances_,
            columns=["Variance"],
            index=[f"Cluster{c}" for c in range(k)],
        )
    )
    .join(
        pd.DataFrame(
            model.weights_, columns=["Weight"], index=[f"Cluster{c}" for c in range(k)]
        )
    )
)
gmm_parameters
Out[27]:
Mean_PC1 Mean_PC2 Mean_PC3 Mean_PC4 Mean_PC5 Mean_PC6 Mean_PC7 Variance Weight
Cluster0 -3.451822 0.741523 0.577830 0.234283 0.281182 0.150651 -0.142495 1.032032 0.112743
Cluster1 1.898625 0.136341 -0.325492 0.613008 0.537588 -0.253704 -0.145181 0.657455 0.199729
Cluster2 -0.779855 -0.302831 -0.445886 0.027444 -0.328718 0.054580 0.073223 0.645718 0.469115
Cluster3 1.720595 0.142985 0.957063 -0.740445 0.069288 0.037007 0.049044 3.643744 0.218413
In [28]:
gmm_parameters.to_csv(ODIR.joinpath("GMM_parameters.csv"))

Plot¶

In [29]:
help(visualise.plot_gmm)
Help on function plot_gmm in module hysevt.utils.visualise:

plot_gmm(X, Y_, means, covariances, cluster_colors, alpha=0, no_ticks=False, ax=None, xy=None, e=3)

In [30]:
dim = len(training_data.columns)

with PdfPages(ODIR.joinpath(f"GMM_cluster_mean_variance.pdf")) as pdf:
    for i in range(dim):
        for j in range(i + 1, dim):
            visualise.plot_gmm(
                X_train,
                model.predict(X_train),
                model.means_[:, [i, j]],
                model.covariances_,
                cluster_colors=cluster_colors,
                xy=(i, j),
                alpha=0.2,
                e=(X_train[:, [i, j]].max() - X_train[:, [i, j]].min()) / 7,
            )
            plt.xlabel(f"PC{i+1}")
            plt.ylabel(f"PC{j+1}")
            pdf.savefig()
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
X does not have valid feature names, but GaussianMixture was fitted with feature names
More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).

Z-Scores and event shapes¶

In [31]:
event_metrics.drop(columns=["start", "end"]).hist(figsize=(10, 10))
plt.tight_layout()
In [32]:
# calculate zscores
event_metrics_zscores = event_metrics.drop(columns=["start", "end"]).apply(
    partial(scipy.stats.zscore, nan_policy="omit"), axis=0
)
# remove some columns and add cluster labels
event_metrics_zscores = event_metrics_zscores.drop(
    columns=[
        "year",
        "pSSY",
        "pQtotal",
        "n_SSC_peaks",
        "n_Q_peaks",
        "last_event_Qtotal_elapsed_time_logratio",
        "time_since_last_event",
    ]
).join(
    labels
)  # add cluster labels
cluster_zscores_average = hysevt.clustering.evaluation.calc_cluster_zscore_averages(
    event_metrics_zscores, mode="median"
)
In [33]:
event_metrics_zscores.hist(figsize=(10, 10))
plt.tight_layout()
In [34]:
event_metrics_zscores.to_csv(
    ODIR.joinpath(f"best_cluster_number_{input_data}_results_zscores.csv")
)
cluster_zscores_average.to_csv(
    ODIR.joinpath(f"best_cluster_number_{input_data}_results_zscore_averages.csv")
)
In [35]:
em_col_order = [
    "seasonal_timing",
    "duration",
    "SSY_falling_to_rising_ratio",
    "SQPR",
    "peak_phase_diff",
    "SHI",
    "AHI",
    "last_event_SSY_elapsed_time_logratio",
    "Q_max_previous_ratio",
    "SSY",
    "SSC_max",
    "SSC_mean_weighted",
    "SSC_mean",
    "Q_mean",
    "Q_max",
    "Qtotal",
]
em_col_order_labels = [
    r"$DOY$",
    r"$\Delta t$",
    r"$SSY_{ratio}$",
    r"$SQPR$",
    r"$phi_{peak}$",
    r"$SHI$",
    r"$AHI$",
    r"$IEI$",
    r"$Q_{peak,ratio}$",
    r"$SSY$",
    r"$SSC_{max}$",
    r"$SSC_{mean,w}$",
    r"$SSC_{mean}$",
    r"$Q_{mean}$",
    r"$Q_{max}$",
    r"$Q_{total}$",
]
em_colors = [
    "darkslategrey",
    "darkslategrey",
    "green",
    "green",
    "green",
    "green",
    "green",
    "darkgoldenrod",
    "darkgoldenrod",
    "brown",
    "brown",
    "brown",
    "brown",
    "blue",
    "blue",
    "blue",
]
In [39]:
ax = visualise.violinplot_cluster_zscore(
    event_metrics_zscores,
    k,
    col_order=em_col_order,
    col_order_labels=em_col_order_labels,
    col_colors=em_colors,
    cluster_colors=cluster_colors,
)
plt.savefig(
    ODIR.joinpath(f"event_metrics_zscore_violins.png"),
    dpi=600,
    bbox_inches="tight",
)
In [40]:
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, len(em_col_order) / 3),
    sharey=True,
    sharex=True,
)
for c in range(k):
    visualise.plot_cluster_zscore_averages(
        cluster_zscores_average.loc[:, em_col_order],
        c,
        ax=ax[c],
        color=em_colors,
    )
    ax[c].set_title(f"Cluster {c}")
ax[0].set_yticks(ax[0].get_yticks(), em_col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_zscores_averages.png"),
    dpi=300,
    bbox_inches="tight",
)
In [41]:
lower, upper = hysevt.clustering.evaluation.calc_cluster_zscore_error_range(
    event_metrics_zscores, error_range=(0.25, 0.75)
)

fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, len(em_col_order) / 4),
    sharey=True,
    sharex=True,
)
for c in range(k):
    visualise.plot_cluster_zscore_averages_with_error_range(
        cluster_zscores_average.loc[:, em_col_order],
        lower.loc[:, em_col_order],
        upper.loc[:, em_col_order],
        c,
        ax=ax[c],
        color=em_colors,
        error_color="lightgrey",
    )
    ax[c].set_title(f"Cluster {c}", fontsize=16)

ax[0].set_xlim(-1.2, 1.8)
ax[0].set_xticks((-1, 1))
ax[0].set_yticks(ax[0].get_yticks(), em_col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_zscores_averages_error_bars_EGUposter.png"
    ),
    dpi=600,
    bbox_inches="tight",
)
In [42]:
with PdfPages(ODIR.joinpath(f"best_cluster_number_results_{input_data}.pdf")) as pdf:
    visualise.plot_cluster_result_with_zscore_averages(
        data, labels.values, cluster_zscores_average
    )
    plt.savefig(
        ODIR.joinpath(
            f"best_cluster_number_results_{input_data}_with_zscore_averages.png"
        ),
        dpi=300,
        bbox_inches="tight",
    )
    pdf.savefig()
    visualise.plot_cluster_zscores(event_metrics_zscores)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_zscores.png"),
        dpi=300,
        bbox_inches="tight",
    )
    pdf.savefig()
    fig_list = visualise.quickplot_clustering_results(event_metrics.join(labels))
    [pdf.savefig(fig) for fig in fig_list]
To output multiple subplots, the figure containing the passed axes is being cleared.
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Input In [42], in <cell line: 1>()
     14 plt.savefig(
     15     ODIR.joinpath(f"best_cluster_number_{input_data}_zscores.png"),
     16     dpi=300,
     17     bbox_inches="tight",
     18 )
     19 pdf.savefig()
---> 20 fig_list = visualise.quickplot_clustering_results(event_metrics.join(labels))
     21 [pdf.savefig(fig) for fig in fig_list]

File ~/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/src/hysevt/hysevt/utils/visualise.py:320, in quickplot_clustering_results(clustering_results, with_year_column)
    316 fig, ax = plt.subplots(ncols=2)
    317 clustering_results.boxplot(
    318     column="last_event_SSY_elapsed_time_logratio", by="cluster_id", grid=False, color="grey", ax=ax[0]
    319 )
--> 320 clustering_results.boxplot(
    321     column="SSC_to_Q_peak_logratio", by="cluster_id", grid=False, color="grey", ax=ax[1]
    322 )
    323 plt.tight_layout()
    324 fig_list.append(fig)

File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_core.py:511, in boxplot_frame(self, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, backend, **kwargs)
    494 @Substitution(backend=_backend_doc)
    495 @Appender(_boxplot_doc)
    496 def boxplot_frame(
   (...)
    508     **kwargs,
    509 ):
    510     plot_backend = _get_plot_backend(backend)
--> 511     return plot_backend.boxplot_frame(
    512         self,
    513         column=column,
    514         by=by,
    515         ax=ax,
    516         fontsize=fontsize,
    517         rot=rot,
    518         grid=grid,
    519         figsize=figsize,
    520         layout=layout,
    521         return_type=return_type,
    522         **kwargs,
    523     )

File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_matplotlib/boxplot.py:425, in boxplot_frame(self, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, **kwds)
    410 def boxplot_frame(
    411     self,
    412     column=None,
   (...)
    421     **kwds,
    422 ):
    423     import matplotlib.pyplot as plt
--> 425     ax = boxplot(
    426         self,
    427         column=column,
    428         by=by,
    429         ax=ax,
    430         fontsize=fontsize,
    431         grid=grid,
    432         rot=rot,
    433         figsize=figsize,
    434         layout=layout,
    435         return_type=return_type,
    436         **kwds,
    437     )
    438     plt.draw_if_interactive()
    439     return ax

File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_matplotlib/boxplot.py:372, in boxplot(data, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, **kwds)
    367         columns = [column]
    369 if by is not None:
    370     # Prefer array return type for 2-D plots to match the subplot layout
    371     # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580
--> 372     result = _grouped_plot_by_column(
    373         plot_group,
    374         data,
    375         columns=columns,
    376         by=by,
    377         grid=grid,
    378         figsize=figsize,
    379         ax=ax,
    380         layout=layout,
    381         return_type=return_type,
    382     )
    383 else:
    384     if return_type is None:

File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_matplotlib/boxplot.py:249, in _grouped_plot_by_column(plotf, data, columns, by, numeric_only, grid, figsize, ax, layout, return_type, **kwargs)
    247 for i, col in enumerate(columns):
    248     ax = _axes[i]
--> 249     gp_col = grouped[col]
    250     keys, values = zip(*gp_col)
    251     re_plotf = plotf(keys, values, ax, **kwargs)

File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1338, in DataFrameGroupBy.__getitem__(self, key)
   1329 if isinstance(key, tuple) and len(key) > 1:
   1330     # if len == 1, then it becomes a SeriesGroupBy and this is actually
   1331     # valid syntax, so don't raise warning
   1332     warnings.warn(
   1333         "Indexing with multiple keys (implicitly converted to a tuple "
   1334         "of keys) will be deprecated, use a list instead.",
   1335         FutureWarning,
   1336         stacklevel=find_stack_level(),
   1337     )
-> 1338 return super().__getitem__(key)

File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/core/base.py:250, in SelectionMixin.__getitem__(self, key)
    248 else:
    249     if key not in self.obj:
--> 250         raise KeyError(f"Column not found: {key}")
    251     subset = self.obj[key]
    252     ndim = subset.ndim

KeyError: 'Column not found: SSC_to_Q_peak_logratio'

Event type shapes and hysteresis¶

In [ ]:
# plot just event shapes
fig, ax = plt.subplots(
    ncols=k,
    nrows=3,
    figsize=(3 * k, 3 * 3),
    sharey=True,
    sharex=False,
)
for c in range(k):
    visualise.plot_single_cluster(data, labels.values, c, ax=ax[:, c], ylabels=c == 0)

plt.savefig(ODIR.joinpath("event_cluster_shape_and_hysteresis.png"), dpi=600)
In [ ]:
# plot just event shapes
fig, ax = plt.subplots(
    ncols=k,
    nrows=3,
    figsize=(2 * k, 1.5 * 3),
    sharey=True,
    sharex=False,
)
for c in range(k):
    visualise.plot_single_cluster(data, labels.values, c, ax=ax[:, c], ylabels=False)

ax[1, 1].set_xlabel(" " * 50 + "Standardised event length")
ax[0, 0].set_ylabel("Q (standardised)", color="blue")
ax[1, 0].set_ylabel("SSC (standardised)", color="brown")
ax[1, 0].set_yticks((0, 1))
for a in ax[:2, :].ravel():
    a.set_xticks((0, 49))
    a.set_xticklabels((1, 50))
ax[1, 0].set_ylim(0, 1)
plt.tight_layout()
# plt.savefig(ODIR.joinpath("event_cluster_shape_EGUposter.png"), dpi=600)
In [ ]:
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, 3),
    sharey=True,
    sharex=False,
)
alpha = 0.1

for c in range(k):
    cluster_average = np.median(data[labels == c], axis=0)
    for e in range((labels == c).sum()):
        ax[c].scatter(
            x=data[labels == c, 0, :][e],
            y=data[labels == c, 1, :][e],
            c=range(50),
            cmap="copper",
            s=30,
            alpha=alpha,
            edgecolor="none",
        )
    ax[c].plot(cluster_average[0], cluster_average[1], color="black")
plt.savefig(ODIR.joinpath("event_cluster_hysteresis.png"), dpi=600)

Catchment metrics¶

All events¶

In [43]:
catchment_water_metrics = pd.read_csv(
    eventsDIR.joinpath("event_catchment_water_metrics.csv"), index_col=0
)
catchment_energy_metrics = pd.read_csv(
    eventsDIR.joinpath("event_catchment_energy_metrics.csv"), index_col=0
)
In [47]:
catchment_water_metrics_zscores = hysevt.clustering.evaluation.calc_metric_zscores(
    catchment_water_metrics,
    labels,
    drop_cols=[
        "start",
        "end",
        "pre_start",
        "NAPI7_SPARTACUS",
        "NAPI7_INCA",
    ],
)
catchment_water_metrics_zscores_average = (
    hysevt.clustering.evaluation.calc_cluster_zscore_averages(
        catchment_water_metrics_zscores
    )
)

(
    catchment_water_metrics_zscores_lower,
    catchment_water_metrics_zscores_upper,
) = hysevt.clustering.evaluation.calc_cluster_zscore_error_range(
    catchment_water_metrics_zscores, error_range=(0.25, 0.75)
)
In [49]:
catchment_energy_metrics_zscores = hysevt.clustering.evaluation.calc_metric_zscores(
    catchment_energy_metrics,
    labels,
    drop_cols=[
        "start",
        "end",
        "event_Tmean_INCA",
        "event_Tmax_INCA",
        "ATmean5_INCA",
        "ATmean7_INCA",
        "ATmean14_INCA",
        "ATmean5_SPARTACUS",
        "ATmean7_SPARTACUS",
        "ATmean14_SPARTACUS",
        "AFI5_INCA",
        "ATI5_INCA",
        "AFI7_INCA",
        "ATI7_INCA",
        "AFI7_SPARTACUS",
        "ATI7_SPARTACUS",
        "AFI14_INCA",
        "ATI14_INCA",
        "AFI30_INCA",
        "ATI30_INCA",
        "AFI14_SPARTACUS",
        "ATI14_SPARTACUS",
        "AFI30_SPARTACUS",
        "AFI5_SPARTACUS",
    ],
)
catchment_energy_metrics_zscores_average = (
    hysevt.clustering.evaluation.calc_cluster_zscore_averages(
        catchment_energy_metrics_zscores, mode="mean"
    )
)

(
    catchment_energy_metrics_zscores_lower,
    catchment_energy_metrics_zscores_upper,
) = hysevt.clustering.evaluation.calc_cluster_zscore_error_range(
    catchment_energy_metrics_zscores, error_range=(0.1, 0.9)
)
In [50]:
catchment_metrics_zscores_average = catchment_energy_metrics_zscores_average.join(
    catchment_water_metrics_zscores_average
)

catchment_metrics_zscores_lower = catchment_energy_metrics_zscores_lower.join(
    catchment_water_metrics_zscores_lower
)
catchment_metrics_zscores_upper = catchment_energy_metrics_zscores_upper.join(
    catchment_water_metrics_zscores_upper
)
In [51]:
col_order = [
    "fSCA",
    "ACDA_SPARTACUS",
    "event_global_radiation",
    "event_Tmean_SPARTACUS",
    "event_Tmax_SPARTACUS",
    "antecedent_global_radiation",
    "ATI5_SPARTACUS",
    "ATI30_SPARTACUS",
    "SCA_change",
    "event_precip_sum_SPARTACUS",
    "event_precip_sum_INCA",
    "event_max_precip_intensity_SPARTACUS",
    "event_max_precip_intensity_INCA",
    "NAPI5_SPARTACUS",
    "NAPI14_SPARTACUS",
]

col_order_labels = [
    "fSCA",
    "ACDA",
    "$GR_{event}$",
    "$T_{mean}$",
    "$T_{max}$",
    "AGR",
    "ATI5",
    "ATI30",
    "SCA_change",
    "$P_{total}$ (daily)",
    "$P_{total}$ (hourly)",
    "$I_{max}$ (daily)",
    "$I_{max}$ (hourly)",
    "NAPI5",
    "NAPI14",
]

colors = [
    "grey",
    "grey",
    "hotpink",
    "red",
    "maroon",
    "salmon",
    "darksalmon",
    "darksalmon",
    "aqua",
    "blue",
    "blue",
    "darkblue",
    "darkblue",
    "slategrey",
    "slategrey",
]
In [61]:
ax = visualise.violinplot_cluster_zscore(
    event_metrics_zscores=catchment_energy_metrics_zscores.join(
        catchment_water_metrics_zscores.drop(columns=["cluster_id"])
    ),
    k=k,
    col_order=col_order,
    col_colors=colors,
    col_order_labels=col_order_labels,
    cluster_colors=cluster_colors,
)

plt.savefig(
    ODIR.joinpath(f"catchment_metrics_zscore_violins.png"),
    dpi=600,
    bbox_inches="tight",
)
In [52]:
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 3),
    sharey=True,
    sharex=True,
)

for c in range(k):
    visualise.plot_cluster_zscore_averages(
        catchment_metrics_zscores_average.loc[:, col_order],
        c,
        ax=ax[c],
        color=colors,
    )
    ax[c].set_title(f"Cluster {c}")
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores.png"
    ),
    dpi=300,
)
In [53]:
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 3),
    sharey=True,
    sharex=True,
)

for c in range(k):
    visualise.plot_cluster_zscore_averages_with_error_range(
        catchment_metrics_zscores_average.loc[:, col_order],
        catchment_metrics_zscores_lower.loc[:, col_order],
        catchment_metrics_zscores_upper.loc[:, col_order],
        c,
        ax=ax[c],
        color=colors,
        set_xlim_to_global=False,
    )
    ax[c].set_title(f"Cluster {c}")
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores_with_IQRerror_bars_version1.png"
    ),
    dpi=300,
)
In [54]:
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 3),
    sharey=True,
    sharex=True,
)

for c in range(k):
    visualise.plot_cluster_zscore_averages_with_error_range(
        catchment_metrics_zscores_average.loc[:, col_order],
        catchment_metrics_zscores_lower.loc[:, col_order],
        catchment_metrics_zscores_upper.loc[:, col_order],
        c,
        ax=ax[c],
        color=colors,
        set_xlim_to_global=True,
    )
    ax[c].set_title(f"Cluster {c}")
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores_with_IQRerror_bars_version2.png"
    ),
    dpi=300,
)
In [55]:
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 4),
    sharey=True,
    sharex=True,
)

for c in range(k):
    visualise.plot_cluster_zscore_averages_with_error_range(
        catchment_metrics_zscores_average.loc[:, col_order],
        catchment_metrics_zscores_lower.loc[:, col_order],
        catchment_metrics_zscores_upper.loc[:, col_order],
        c,
        ax=ax[c],
        color=colors,
        set_xlim_to_global=False,
        error_color="lightgrey",
    )
    ax[c].set_title(f"Cluster {c}", fontsize=16)
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
ax[0].set_xlim(-1.2, 1.2)
ax[0].set_xticks((-1.5, 1))
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores_with_IQRerror_bars_EGUposter.png"
    ),
    dpi=300,
)

Representative events¶

In [62]:
cluster_representative
Out[62]:
['VEN-AUT-2015-037',
 'VEN-AUT-2006-075',
 'VEN-AUT-2009-030',
 'VEN-AUT-2007-085']

Actual values¶

In [63]:
catchment_energy_metrics.drop(columns=["start", "end"]).join(
    catchment_water_metrics.drop(columns=["start", "end", "pre_start"])
).loc[cluster_representative, col_order].plot.bar(
    color=colors, figsize=(20, 5), legend=False
)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
Out[63]:
<matplotlib.legend.Legend at 0x7fe1301feb20>

Z-Scores¶

In [64]:
cluster_representative
Out[64]:
['VEN-AUT-2015-037',
 'VEN-AUT-2006-075',
 'VEN-AUT-2009-030',
 'VEN-AUT-2007-085']
In [65]:
fig, ax = plt.subplots(
    figsize=(6, 10),
)
catchment_energy_metrics_zscores.join(
    catchment_water_metrics_zscores.drop(columns=["cluster_id"])
).loc[cluster_representative, col_order].plot.barh(
    ax=ax, color=colors, legend=False, label="cluster_id"
)
plt.yticks(
    range(len(cluster_representative)),
    [f"Cluster{i}" for i in range(len(cluster_representative))],
)
handles, llabels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], llabels[::-1], bbox_to_anchor=(1, 1), loc="upper left")
Out[65]:
<matplotlib.legend.Legend at 0x7fe11bd5e820>

Metric histograms¶

In [66]:
all_metrics = (
    catchment_energy_metrics.drop(columns=["start", "end"])
    .join(catchment_water_metrics.drop(columns=["start", "end", "pre_start"]))[
        col_order
    ]
    .join(
        event_metrics_labels[
            [
                "duration",
                "seasonal_timing",
                "SSY",
                "SSC_max",
                "SSC_mean_weighted",
                "Qtotal",
                "Q_max",
                "Q_mean",
                "SSY_falling_to_rising_ratio",
                "Q_max_previous_ratio",
                "last_event_SSC_max_elapsed_time_logratio",
                "peak_phase_diff",
                "SQPR",
                "SHI",
                "AHI",
                "cluster_id",
            ]
        ]
    )
)
In [67]:
fig, ax = plt.subplots(
    ncols=5, nrows=int(len(all_metrics.columns.to_list()[:-1]) / 5), figsize=(15, 15)
)
ax = ax.ravel()
for i, m in enumerate(all_metrics.columns.to_list()[:-1]):
    all_metrics.pivot(columns="cluster_id")[m].drop(columns=[np.nan]).plot(
        kind="kde",
        stacked=True,
        color=cluster_colors,
        ax=ax[i],
        legend=False,
        logx=m
        in [
            "SSY",
            "SSC_max",
            "SSC_mean_weighted",
            "Qtotal",
            "Q_max",
            "Q_mean",
        ],
    )
    ax[i].set_xlim(all_metrics[m].min(), all_metrics[m].max())
    ax[i].set_title(m)

plt.tight_layout()
ax[4].legend(bbox_to_anchor=(1, 1), loc="upper left", title="Cluster")
plt.savefig(
    ODIR.joinpath(f"cluster_{input_data}_results_all_metrics_kde.png"),
    dpi=300,
    bbox_inches="tight",
)

SHAP¶

Making the clustering explainable with SHAP.

https://towardsdatascience.com/how-to-make-clustering-explainable-1582390476cc

In [68]:
explainer = shap.Explainer(model.predict, training_data)
shap_values = explainer(training_data)
In [69]:
# example event with highest cluster prob from each cluster
for c in range(k):
    # get event with highest probability
    event_id = random.choice(
        list(
            cluster_prob[
                cluster_prob[f"prob_cluster_{c}"]
                == cluster_prob[f"prob_cluster_{c}"].max()
            ].index
        )
    )
    i = np.where(training_data.index == event_id)[0][0]
    # plot
    plt.figure()
    shap.plots.waterfall(shap_values[i], show=False)
    plt.savefig(
        ODIR.joinpath(
            f"best_cluster_number_{input_data}_results_SHAP_cluster{c}_example_{event_id}.png"
        ),
        dpi=300,
        bbox_inches="tight",
    )
In [70]:
# all events
shap.plots.beeswarm(shap_values, show=False)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_results_SHAP.png"),
    dpi=300,
    bbox_inches="tight",
)
In [71]:
for c, train in training_data.join(labels).groupby("cluster_id"):
    shap_values = explainer(train.drop(columns=["cluster_id"]))
    plt.figure()
    shap.plots.beeswarm(shap_values, show=False)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_results_SHAP_cluster{c}.png"),
        dpi=300,
        bbox_inches="tight",
    )

All metrics¶

In [72]:
size = (len(em_col_order) + len(col_order)) / 2
gs = mpl.gridspec.GridSpec(
    2, k, width_ratios=[1] * k, height_ratios=[len(em_col_order), len(col_order)]
)
fig = plt.figure(figsize=(size, 3 * size / k))
em_ax = np.array([fig.add_subplot(gs[c]) for c in range(k)])
cm_ax = np.array([fig.add_subplot(gs[c]) for c in range(k, k * 2)])

# event metrics
for c, sub in event_metrics_zscores.groupby("cluster_id"):
    for j, m in enumerate(em_col_order):
        violins = em_ax[int(c)].violinplot(
            sub.loc[:, m].dropna(),
            positions=[j + 1],
            vert=False,
            showmedians=False,
            showmeans=True,
            widths=0.9,
        )
        violins["bodies"][0].set_edgecolor(em_colors[j])
        violins["bodies"][0].set_facecolor(em_colors[j])
        violins["bodies"][0].set_alpha(0.7)
        for partname in ("cbars", "cmins", "cmaxes", "cmeans"):
            vp = violins[partname]
            vp.set_edgecolor(em_colors[j])
            vp.set_linewidth(2 if partname == "cmeans" else 1)

    em_ax[int(c)].vlines(
        0, 0, len(em_col_order) + 1, color="grey", alpha=0.5, linewidth=1
    )
    em_ax[int(c)].set_xlabel("Z-Score")
    em_ax[int(c)].set_title(
        f"Cluster{int(c)}", color=cluster_colors[int(c)], fontsize=16
    )
    em_ax[int(c)].set_yticks(range(1, len(em_col_order) + 1), [""] * len(em_col_order))

    em_ax[int(c)].set_ylim(0, len(em_col_order) + 1)
    em_ax[int(c)].set_xlim(-7, 7)
em_ax[0].set_yticks(range(1, len(em_col_order) + 1), em_col_order_labels)


# catchment metrics
for c, sub in catchment_energy_metrics_zscores.join(
    catchment_water_metrics_zscores.drop(columns=["cluster_id"])
).groupby("cluster_id"):

    for j, m in enumerate(col_order):
        violins = cm_ax[int(c)].violinplot(
            sub.loc[:, m].dropna(),
            positions=[j + 1],
            vert=False,
            showmedians=False,
            showmeans=True,
            widths=0.9,
        )
        violins["bodies"][0].set_edgecolor(colors[j])
        violins["bodies"][0].set_facecolor(colors[j])
        violins["bodies"][0].set_alpha(0.7)
        for partname in ("cbars", "cmins", "cmaxes", "cmeans"):
            vp = violins[partname]
            vp.set_edgecolor(colors[j])
            vp.set_linewidth(2 if partname == "cmeans" else 1)

    cm_ax[int(c)].vlines(0, 0, 16, color="grey", alpha=0.5, linewidth=1)
    cm_ax[int(c)].set_xlabel("Z-Score")
    cm_ax[int(c)].set_ylim(0, len(col_order) + 1)
    cm_ax[int(c)].set_yticks(range(1, len(col_order) + 1), [""] * len(col_order))
    cm_ax[int(c)].set_xlim(-7, 7)
cm_ax[0].set_yticks(range(1, len(col_order) + 1), col_order_labels)

plt.tight_layout()

plt.savefig(
    ODIR.joinpath(f"event_catchment_metrics_violinplots.png"),
    dpi=300,
    bbox_inches="tight",
)

Event type proportion of annual sediment yield¶

In [78]:
event_type_yield = (
    event_metrics_labels.groupby(["year", "cluster_id"], dropna=True)
    .SSY.sum()
    .unstack()
)
event_type_yield.columns = [f"type_{int(col)}" for col in event_type_yield.columns]
annual_SSY_event_type = annual_SSY.join(
    event_type_yield.sum(axis=1).rename("total_event_yield")
)
annual_SSY_event_type["non_event_yield"] = (
    annual_SSY_event_type.sediment_yield - annual_SSY_event_type.total_event_yield
)
event_type_yield = event_type_yield.join(annual_SSY_event_type["non_event_yield"])
event_type_yield.to_csv(ODIR.joinpath("event_type_annual_sediment_yield.csv"))
In [79]:
# set new column names
event_type_yield.columns = [f"Cluster {c}" for c in range(k)] + [
    "non-events",
]
In [80]:
event_type_yield.divide(annual_SSY_event_type.sediment_yield, axis="rows").mean(axis=0)
Out[80]:
Cluster 0     0.242324
Cluster 1     0.062559
Cluster 2     0.359808
Cluster 3     0.131030
non-events    0.236588
dtype: float64
In [81]:
fig, ax = plt.subplots(figsize=(15, 5), ncols=2)
event_type_yield.plot.bar(
    stacked=True,
    ax=ax[0],
    color=np.vstack((cluster_colors, np.array(plt.cm.gray(0.5)).reshape(1, 4))),
)
ax[0].set_ylabel("Annual suspended sediment yield [t]")
event_type_yield.mean(axis=0).plot.pie(
    ax=ax[1],
    autopct="%1.1f%%",
    colors=np.vstack((cluster_colors, np.array(plt.cm.gray(0.5)).reshape(1, 4))),
    legend=False,
)
ax[0].legend(loc="upper left", bbox_to_anchor=(1, 1))
ax[1].set_ylabel("")
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_annual_sediment_yield_event_types.png"
    ),
    dpi=600,
    bbox_inches="tight",
)
In [ ]: